#include <iostream>
#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>

static void _calc_bs(double sigma, double& b0, double& b1, double& b2, double& b3, double& B) {
    double q = 0.1147705018520355224609375;
    if (sigma >= 2.5) {
        q = 0.98711 * sigma - 0.96330;
    } else if (sigma >= 0.5 && sigma < 2.5) {
        q = 3.97156 - 4.14554 * std::sqrt(1. - 0.26891 * sigma);
    }
    double q2 = q * q;
    double q3 = q2 * q;
    b0 = 1.57825 + (2.44413 * q) + (1.4281 * q2) + 0.422205 * q3;
    b1 = (2.44413 * q) + (2.85619 * q2) + 1.26661 * q3;
    b2 = -((1.4281 * q2) + 1.26661 * q3);
    b3 =  0.422205 * q3;
    B = 1.0 - (b1+b2+b3) / b0;
}

static void _gaussian_blur_1d(const double* in, double* out, int size, int stride, double b0, double b1, double b2, double b3, double B) {
    double* w = (double*)malloc(sizeof(double) * (size+3));
    double* w_tmp = (double*)malloc(sizeof(double) * (size+3));
    w[0] = w[1] = w[2] = in[0];
    
    // forward
    for (int n = 3; n < size+3; n++) {
        double in_n = in[(n-3)*stride];
        w[n] = B*in_n + (b1*w[n-1]+b2*w[n-2]+b3*w[n-3])/b0;
    }
    
    //backward
    w_tmp[size] = w_tmp[size+1] = w_tmp[size+2] = w[size+2];
    for (int n = size-1; n>=0; n--) {
        w_tmp[n] = B*w[n]+(b1*w_tmp[n+1]+b2*w_tmp[n+2]+b3*w_tmp[n+3])/b0;
        out[n*stride] = w_tmp[n];
    }
    
    free(w);
    free(w_tmp);
}

static void _gaussian_blur_2d(const double* pIn, double* pOut, int rows, int cols, double b0, double b1, double b2, double b3, double B) {
    for (int y = 0; y < rows; y++) {
        _gaussian_blur_1d(pIn + y * cols, pOut + y * cols, cols, 1, b0, b1, b2, b3, B);
    }
    for (int x = 0; x < cols ;x++) {
        _gaussian_blur_1d(pOut+x, pOut+x, rows, cols, b0, b1, b2, b3, B);
    }
}


int main(int argc, char **argv) {
    cv::Mat src = cv::imread("/home/xlll/Downloads/opencv/samples/data/lena.jpg", cv::IMREAD_GRAYSCALE);
    if (src.empty()) {
        std::cout << "failed to read image" << std::endl;
        return EXIT_FAILURE;
    }
    
    double sigma = 3.1;
    cv::Mat dst1;
    
    clock_t start, finish;
    start = clock();
    for (int i = 0; i < 100; i++) {
        cv::GaussianBlur(src, dst1, cv::Size(201, 201), sigma, sigma);
    }
    finish = clock();
    std::cout << "GaussianBlur time = " << (double)(finish-start) / CLOCKS_PER_SEC << std::endl;
    
    double b0, b1, b2, b3, B;
    _calc_bs(sigma, b0, b1, b2, b3, B);
    cv::Mat dst2(src.size(), src.type());
    
    double* pDst2 = (double*)malloc(sizeof(double)*src.rows*src.cols);
    cv::Mat dst2_double(src.size(), CV_64FC1, pDst2);
    double* pSrc = (double*)malloc(sizeof(double)*src.rows*src.cols);
    for (int i = 0; i < src.rows; i++) {
        for (int j = 0; j < src.cols; j++) {
            pSrc[i*src.cols+j] = src.ptr<uchar>(i)[j];
        }
    }
    start = clock();
    for (int i = 0; i < 100; i++) {
        _gaussian_blur_2d(pSrc, pDst2, src.rows,src.cols, b0, b1, b2, b3, B);
    }
    finish = clock();
    dst2_double.convertTo(dst2, CV_8UC1);
    std::cout << "_gaussian_blur_2d time = " << (double)(finish-start)/CLOCKS_PER_SEC << std::endl;
    
    cv::imshow("src", src);
    cv::imshow("dst1", dst1);
    cv::imshow("dst2", dst2);
    cv::waitKey(0);
    
    free(pDst2);
    
    return 0;
}
